library(tidyverse)
library(dplyr)
library(lubridate)
library(ggplot2)
library(leaflet)
library(ggridges)
data <- read_csv('https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD')
summary(data)
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME BORO
## Min. : 9953245 Length:28562 Length:28562 Length:28562
## 1st Qu.: 65439914 Class :character Class1:hms Class :character
## Median : 92711254 Mode :character Class2:difftime Mode :character
## Mean :127405824 Mode :numeric
## 3rd Qu.:203131993
## Max. :279758069
##
## LOC_OF_OCCUR_DESC PRECINCT JURISDICTION_CODE LOC_CLASSFCTN_DESC
## Length:28562 Min. : 1.0 Min. :0.0000 Length:28562
## Class :character 1st Qu.: 44.0 1st Qu.:0.0000 Class :character
## Mode :character Median : 67.0 Median :0.0000 Mode :character
## Mean : 65.5 Mean :0.3219
## 3rd Qu.: 81.0 3rd Qu.:0.0000
## Max. :123.0 Max. :2.0000
## NA's :2
## LOCATION_DESC STATISTICAL_MURDER_FLAG PERP_AGE_GROUP
## Length:28562 Mode :logical Length:28562
## Class :character FALSE:23036 Class :character
## Mode :character TRUE :5526 Mode :character
##
##
##
##
## PERP_SEX PERP_RACE VIC_AGE_GROUP VIC_SEX
## Length:28562 Length:28562 Length:28562 Length:28562
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## VIC_RACE X_COORD_CD Y_COORD_CD Latitude
## Length:28562 Min. : 914928 Min. :125757 Min. :40.51
## Class :character 1st Qu.:1000068 1st Qu.:182912 1st Qu.:40.67
## Mode :character Median :1007772 Median :194901 Median :40.70
## Mean :1009424 Mean :208380 Mean :40.74
## 3rd Qu.:1016807 3rd Qu.:239814 3rd Qu.:40.82
## Max. :1066815 Max. :271128 Max. :40.91
## NA's :59
## Longitude Lon_Lat
## Min. :-74.25 Length:28562
## 1st Qu.:-73.94 Class :character
## Median :-73.92 Mode :character
## Mean :-73.91
## 3rd Qu.:-73.88
## Max. :-73.70
## NA's :59
colSums(is.na(data))
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME
## 0 0 0
## BORO LOC_OF_OCCUR_DESC PRECINCT
## 0 25596 0
## JURISDICTION_CODE LOC_CLASSFCTN_DESC LOCATION_DESC
## 2 25596 14977
## STATISTICAL_MURDER_FLAG PERP_AGE_GROUP PERP_SEX
## 0 9344 9310
## PERP_RACE VIC_AGE_GROUP VIC_SEX
## 9310 0 0
## VIC_RACE X_COORD_CD Y_COORD_CD
## 0 0 0
## Latitude Longitude Lon_Lat
## 59 59 59
#lets check values in JURISDICTION_CODE and determine the frequency of each
table(data$JURISDICTION_CODE)
##
## 0 1 2
## 23923 81 4556
# and same in PERP_SEX column
table(data$PERP_SEX)
##
## (null) F M U
## 1141 444 16168 1499
From the preliminary analysis, I realized that I would not be able to use the shooting location descriptions in my analysis because the dataset contains a huge number of missing values. This could be explained by the fact that the incidents with missing values simply happened on the street, but this is only an assumption. Therefore, to avoid unnecessary inaccuracies, I will not use the location descriptions in my analysis.
I also found that both the race and age characteristics of the perpetrators have missing values. A demographic analysis with so many missing values would be skewed, but I can use the fact that the perpetrator description is in the data as a binary attribute in my analysis.
Because very little information is available on demographics, this work will involve the analysis of seasonal and geographic trends. It will center on detecting how often such shooting incidents occur and their specifics. It will also analyze the variations in dynamics of such incidents across different geolocations.
The dataset was cleaned by dropping those records that had jurisdiction codes missing, assuming them to be for specific cases. (For example, those cases related to federal agencies.) I then chose the appropriate columns, converted dates and extracting year, month, day of the week, and hour. Moreover, I converted some columns into proper data types like logical and handled unknown values in columns for the race of perpetrator, as well as sex. Finally, several columns were renamed for clarity, and a logical column was added to show whether an perpetrator description is available or not.
data_clean <- data %>%
filter(!is.na(JURISDICTION_CODE)) %>%
mutate(OCCUR_DATE = mdy(OCCUR_DATE),
Year = year(OCCUR_DATE),
Month = month(OCCUR_DATE, label=TRUE),
DayOfWeek = wday(OCCUR_DATE,
week_start = getOption("lubridate.week.start", 7),
label = TRUE),
Hour = hour(OCCUR_TIME),
STATISTICAL_MURDER_FLAG = as.logical(STATISTICAL_MURDER_FLAG),
JURISDICTION_CODE = factor(JURISDICTION_CODE, levels = 0:2,
labels = c("Patrol", "Transit", "Housing")),
PERP_RACE = ifelse(PERP_RACE %in% c("(null)", "UNKNOWN"), NA, PERP_RACE),
PERP_SEX = ifelse(PERP_SEX %in% c("(null)", "U"), NA, PERP_SEX),
HasPerpDescription = ifelse(is.na(PERP_RACE) | is.na(PERP_SEX), FALSE, TRUE)
) %>%
select(c('OCCUR_DATE', 'OCCUR_TIME', 'BORO', 'STATISTICAL_MURDER_FLAG',
'PRECINCT', 'JURISDICTION_CODE', 'Latitude', 'Longitude', 'Year',
'Month', 'DayOfWeek', 'Hour', 'HasPerpDescription')) %>%
rename(Date = OCCUR_DATE, Time = OCCUR_TIME, Borough = BORO,
Precinct = PRECINCT, MurderFlag = STATISTICAL_MURDER_FLAG,
JurisdictionCode = JURISDICTION_CODE)
summary(data_clean)
## Date Time Borough MurderFlag
## Min. :2006-01-01 Length:28560 Length:28560 Mode :logical
## 1st Qu.:2009-09-04 Class1:hms Class :character FALSE:23034
## Median :2013-09-20 Class2:difftime Mode :character TRUE :5526
## Mean :2014-06-07 Mode :numeric
## 3rd Qu.:2019-09-30
## Max. :2023-12-29
##
## Precinct JurisdictionCode Latitude Longitude
## Min. : 1.0 Patrol :23923 Min. :40.51 Min. :-74.25
## 1st Qu.: 44.0 Transit: 81 1st Qu.:40.67 1st Qu.:-73.94
## Median : 67.0 Housing: 4556 Median :40.70 Median :-73.92
## Mean : 65.5 Mean :40.74 Mean :-73.91
## 3rd Qu.: 81.0 3rd Qu.:40.82 3rd Qu.:-73.88
## Max. :123.0 Max. :40.91 Max. :-73.70
## NA's :59 NA's :59
## Year Month DayOfWeek Hour HasPerpDescription
## Min. :2006 Jul : 3389 Sun:5669 Min. : 0.00 Mode :logical
## 1st Qu.:2009 Aug : 3264 Mon:4062 1st Qu.: 3.00 FALSE:12308
## Median :2013 Jun : 2959 Tue:3331 Median :15.00 TRUE :16252
## Mean :2014 May : 2682 Wed:3145 Mean :12.27
## 3rd Qu.:2019 Sep : 2677 Thu:3169 3rd Qu.:20.00
## Max. :2023 Oct : 2378 Fri:3758 Max. :23.00
## (Other):11211 Sat:5426
Then I shall go ahead with a basic temporal and spatial analysis of the data, considering how the numbers vary over the years and by time of the day and how these incidents are distributed across New York City boroughs. Also, I intend to look at what has happened to the number of murders relative to the incidents and compute an estimate for the proportion of perpetrators for whom some basic descriptive information is available.
data_clean %>%
group_by(Year) %>%
summarize(Incidents = n(), Murdered = sum(MurderFlag), .groups = "drop")%>%
ggplot(aes(x = Year)) +
geom_line(aes(y = Incidents, color = "Shooting")) +
geom_line(aes(y = Murdered, color = "Murder")) +
geom_point(aes(y = Incidents, color = "Shooting")) +
geom_point(aes(y = Murdered, color = "Murder")) +
scale_color_manual(values = c("Shooting" = "black", "Murder" = "red")) +
labs(title = "Incidents by Year",
x = "Year",
y = "Number of Incidents",
color = "Legend") +
theme_minimal()
data_clean %>% group_by(Year) %>%
summarize(Murdered = mean(MurderFlag, na.rm = TRUE),
Descripted = mean(HasPerpDescription, na.rm = TRUE)) %>%
ggplot(aes(x = Year)) +
geom_line(aes(y = Descripted, color = "Descripted")) +
geom_line(aes(y = Murdered, color = "Murder")) +
geom_point(aes(y = Descripted, color = "Descripted")) +
geom_point(aes(y = Murdered, color = "Murder")) +
scale_color_manual(values = c("Descripted" = "gray", "Murder" = "red")) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Murder Rates and Perpetrator Descriptions by Year",
x = "Year",
y = "Rate",
color = "Legend") +
theme_minimal()
data_clean %>%
group_by(Borough, Year) %>%
summarize(Incidents = n(), .groups = "drop") %>%
ggplot(aes(x = Year, y = Incidents, color=Borough)) +
geom_line() +
geom_point() +
labs(title = "Incidents by Borough by Year",
x = "Year",
y = "Number of Incidents") +
theme_minimal()
Analysis A temporal analysis of gun-crimes committed in various New York City areas revealed certain patterns. For instance, the pattern of such crimes committed prior to 2016 was negative, but a significant increase was found thereafter. At the same time, a positive correlation was found between the number of homicides and the number of crimes with weapons. A similar pattern was observed in the same analysis in the context of different city districts for several years. However, another interesting correlation was found - with the steady increase in the number of such crimes, the descriptive nature of criminals’ identity was highly variable, which may suggest a deficiency in data collection methods and record keeping directly by New York City law enforcement agencies.
Bias There is a strategy in bias when comparing data by Borough and the absolute numbers presented on each graph do not account for how many people live in the 5 boroughs. Naturally, the Bronx and Brooklyn high numbers come in part by virtue that these are two heavily populated areas. If you do not take this information into account, your conclusions will be biased because places that are more densely populated always face higher incidents in absolute numbers.
hourly_data <- data_clean %>%
mutate(
Time_Rounded = round_date(as.POSIXct(Time, origin = "1970-01-01", tz = "UTC"),
unit = "10 minutes"),
Time_Decimal = hour(Time_Rounded) + minute(Time_Rounded) / 60)
hourly_data %>%
group_by(Time_Decimal, Borough) %>%
summarize(Incidents = n(), .groups = "drop") %>%
ggplot(aes(x = Time_Decimal, y = Incidents, color=Borough)) +
geom_point() +
scale_x_continuous(breaks = seq(0, 23, by = 1), labels = sprintf("%02d:00", 0:23)) +
labs(title = "Incidents by Borough Hourly",
x = "Hour",
y = "Number of Incidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
hourly_data %>%
group_by(Time_Decimal, DayOfWeek) %>%
summarize(Incidents = n(), .groups = "drop") %>%
ggplot(aes(x = Time_Decimal, y = Incidents, color=DayOfWeek)) +
geom_point() +
scale_x_continuous(breaks = seq(0, 23, by = 1), labels = sprintf("%02d:00", 0:23)) +
labs(title = "Incidents by DayOfWeek Hourly",
x = "Hour",
y = "Number of Incidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
data_clean %>%
group_by(Month, Hour) %>%
summarize(Incidents = n()
, .groups = "drop") %>%
ggplot(aes(x = Month, y = Hour, fill = Incidents)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
scale_y_continuous(breaks = seq(0, 24, by = 4), labels = sprintf("%02d:00", seq(0, 24, by = 4))) +
labs(title = "Month / Hour incidents heatmap",
x = "Month",
y = "Hour",
fill = "Number of Incidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
data_clean %>%
group_by(Month, DayOfWeek) %>%
summarize(Incidents = n()
, .groups = "drop") %>%
ggplot(aes(x = Month, y = DayOfWeek, fill = Incidents)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Month / Day of Week incidents heatmap",
x = "Month",
y = "Day of Week",
fill = "Number of Incidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
hourly_data %>%
group_by(Time_Decimal, JurisdictionCode) %>%
summarize(Incidents = n(), .groups = "drop") %>%
ggplot(aes(x = Time_Decimal, y = Incidents, color=JurisdictionCode)) +
geom_point() +
scale_x_continuous(breaks = seq(0, 23, by = 1), labels = sprintf("%02d:00", 0:23)) +
labs(title = "Incidents by Jurisdiction Hourly",
x = "Hour",
y = "Number of Incidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
data_clean %>% group_by(Hour) %>%
summarize(descripted = mean(HasPerpDescription, na.rm = TRUE)) %>%
ggplot(aes(x = Hour, y = descripted)) +
geom_line() +
geom_point() +
labs(title = "Perpetrator Descriptions Rate Hourly",
x = "Hour",
y = "Rate") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(breaks = seq(0, 23, by = 1), labels = sprintf("%02d:00", 0:23)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Analysis A similar analysis of weapon crime patterns in New York City by specific time period also revealed additional correlations. On weekdays, the absolute maximum occurs during the following time periods: from midnight to 5 AM and from 6 PM to midnight. The highest number of crimes committed during these periods is in Brooklyn and Bronx. At the same time, the maximum number of crimes occurs on Friday, Saturday and Sunday at night. Extra attention should be paid to an additional variable - seasonality. For instance, during the summer months, similar patterns persist, but the number of crimes committed in kind is much higher.
A significant gap identified earlier was the lack of a criminal identity description as well as age and gender in some cases. Analysis has shown that in most cases this absence is due to the time of the crime. For instance, if it was committed during daytime hours, such description is highly accurate, possibly indicating that identification is easier during daytime hours.
data_clean %>% group_by(Borough) %>%
summarize(share_of_incidents = n(),murder_rate = mean(MurderFlag, na.rm = TRUE)) %>%
mutate(share_of_incidents = (share_of_incidents / sum(share_of_incidents))) %>%
arrange(desc(share_of_incidents)) %>%
mutate(pos = cumsum(share_of_incidents) - share_of_incidents / 2) %>%
ggplot(aes(x = 2, y = share_of_incidents, fill = murder_rate)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar(theta = "y") +
scale_fill_gradient(low = "blue", high = "red",labels = scales::percent) +
labs(title = "Incidents and Murder Rates by Borough",
x = NULL, y = NULL, fill = "Murder Rate") +
theme_void() +
theme(legend.position = "right") +
geom_text(aes(y = pos, label = Borough), color = "black", size = 3.5)
data_clean %>% group_by(Precinct) %>%
summarize(
HasDescribed = mean(HasPerpDescription),
Murder = mean(MurderFlag),
.groups = "drop") %>%
pivot_longer(cols = c("HasDescribed","Murder"),
names_to = "Variable", values_to = "Value") %>%
ggplot( aes(x = Value, y = Variable, fill = Variable)) +
geom_density_ridges(bandwidth = 0.02) +
scale_x_continuous(labels = scales::percent, limits = c(0, 1)) +
labs(title = "Distribution of Murder and Perp. Description Probabilities by Precinct",
x = "Probability Percentage",
y = ""
) +
theme_minimal() +
theme(legend.position = "none")
Analysis Firearms crimes’ analysis by various New York City areas concluded that most of them occur in the two most populous ones (Brooklyn and the Bronx). However, the number of such crimes within Manhattan and Staten Island is significantly lower. When these crimes are considered in the context of their severity, the majority of homicides were committed within Staten Island. It follows that the direct number of these crimes may be significantly lower in some areas, but their severity (e.g., homicides number) would be higher relative to other ones. There was also the additional conclusion that in 20-25% of cases, the use of a firearm directly resulted in a homicide, but only in 40-60% of those was the criminal’ identity and description identified. However, the latter variable can vary significantly from area to area, which may reflect the varying approaches of law enforcement agencies to investigating crimes and working with witnesses.
data_geo <- data_clean %>% filter(!is.na(Longitude) & !is.na(Latitude))
map <- leaflet(width = "503px", height = "700px") %>%
addTiles() %>%
setView(lat = 40.75, lng = -73.93, zoom = 11)
map %>%
addCircleMarkers(data = data_geo %>% filter(!MurderFlag),
lng = ~Longitude, lat = ~Latitude,
radius = 2,
fillColor = "black",
stroke = FALSE) %>%
addCircleMarkers(data = data_geo %>% filter(MurderFlag),
lng = ~Longitude, lat = ~Latitude,
radius = 2,
fillColor = "red",
stroke = FALSE) %>%
addLegend(position = "bottomright",
title = "Locations of Incidents",
colors = c("red", "black"),
labels = c("Incidents resulting in murder", "Incidents without murder"),
opacity = 1,
labFormat = labelFormat(prefix = ""),
values = c(0, 100))
map <- leaflet(width = "503px", height = "700px") %>%
addTiles() %>%
setView(lat = 40.75, lng = -73.93, zoom = 11) #%>%
map %>%
addCircleMarkers(data = data_geo %>% filter(JurisdictionCode=="Patrol"),
lng = ~Longitude, lat = ~Latitude,
radius = 2,
fillColor = "black",
stroke = FALSE) %>%
addCircleMarkers(data = data_geo %>% filter(JurisdictionCode=="Housing"),
lng = ~Longitude, lat = ~Latitude,
radius = 2,
fillColor = "red",
stroke = FALSE) %>%
addCircleMarkers(data = data_geo %>% filter(JurisdictionCode=="Transit"),
lng = ~Longitude, lat = ~Latitude,
radius = 2,
fillColor = "blue",
stroke = FALSE) %>%
addLegend(position = "bottomright",
title = "Locations of Incidents by Jurisdiction",
colors = c("black", "red", "blue"),
labels = c("Patrol", "Housing", "Transit"),
opacity = 1,
labFormat = labelFormat(prefix = ""),
values = c(0, 100))
Analysis Spatial analysis reveals meaningful patterns in the geographical location of shooting incidents. These high-density clusters of both fatal and non-fatal incidents are concentrated mostly in the Bronx, northern Manhattan and central Brooklyn. The proportionality between the count of fatal incidents means the level of fatalities is proportional to high local activity areas.
As seen on the jurisdictional map, most incidents are handled by patrol jurisdiction when housing jurisdiction is forming well defined clusters. Why these clusters form is harder to say, whether it be due to incidents there happening more often in buildings or because other sectors are under patrol jurisdiction. Transit-related incidents are much less frequent.
Model description Obviously, the correlation between time of day and number of incidents does not seem to be linear, so I used a polynomial function for the model. Overall, the resulting model does a good job of simulating the average number of incidents, given the wide variation across boroughs.
hourly_data <- hourly_data %>%
group_by(Time_Decimal, Borough) %>%
summarize(Incidents = n(), .groups = "drop")
ggplot(hourly_data, aes(x = Time_Decimal, y = Incidents)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red") +
scale_x_continuous(breaks = seq(0, 23, by = 1), labels = sprintf("%02d:00", 0:23)) +
labs(title = "Incidents by Borough Hourly (Polynomial Regression Model)",
x = "Hour",
y = "Number of Incidents") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
summary(lm(Incidents ~ poly(Time_Decimal, 2), data = hourly_data))
##
## Call:
## lm(formula = Incidents ~ poly(Time_Decimal, 2), data = hourly_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -87.034 -17.910 -6.923 18.927 109.075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.51 1.23 32.931 < 2e-16 ***
## poly(Time_Decimal, 2)1 93.61 32.66 2.866 0.00428 **
## poly(Time_Decimal, 2)2 593.53 32.66 18.171 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.66 on 702 degrees of freedom
## Multiple R-squared: 0.3253, Adjusted R-squared: 0.3233
## F-statistic: 169.2 on 2 and 702 DF, p-value: < 2.2e-16
As mentioned previously I did not normalize the number of incidents for each neighborhood; instead it’s for absolute numbers, which might affect my conclusions. Moreover, there are a good amount of incidents where perpetrators is not designated and this can lead to false conclusions as far as which populations these types of incidences break down amongst.
In addition, these data are collected using certain procedures or from certain sources, this may lead to systematic errors. Some victims, like in gang-related shootings where the law of silence is strictly obeyed, may not go to an official for medical treatment or may refuse to file a report with police.
Some factors of crime such as socioeconomic are not cover by the data. For instance, certain neighborhoods with high crime rates could relate to poor education or sought after a higher unemployment rate.